# insurance <- read.table("BBDD_DEFINITIVA_V4.txt", header = T)
# saveRDS(insurance, "BBDD_DEFINITIVA_V4.RDS")

insurance <- readRDS("BBDD_DEFINITIVA_V4.RDS")
insurance <- insurance %>%
  rename(
    key              = CLAVE, 
    actuarial_age    = EDAD_ACTUARIAL, 
    sus_age          = EDAD_SUS,
    duration_cat     = DURACION_CAT,
    capital          = CAPITAL,
    ) %>%
  mutate(
    period           = factor(insurance$PERIODO),
    cover            = factor(insurance$COVER),
    sex              = factor(ifelse(insurance$SEXO == 0, "man", "woman")),
    smoker           = factor(ifelse(insurance$FUMADOR == 0, "no", "yes")),
    good_health      = factor(ifelse(insurance$BUENA_SALUD == 1, "yes", "no")),
    exp              = as.numeric(gsub(",", ".", gsub("\\.", "", insurance$EXP))),
    mortality        = factor(ifelse(insurance$SINIESTRO == 1, "yes", "no")),
    duration         = as.numeric(gsub(",",".", gsub("\\.", "", insurance$DURACION))),
    capital_factor_1 = factor(insurance$CAPITAL_CAT1_DESC, 
                              levels = c("0-30.000", "30.001-60.000", 
                                         "60.001-90.000","90.001-120.000", 
                                         "120.001-150.000", "+150.000"),
                              ordered = T),
    capital_factor_2 = factor(insurance$CAPITAL_CAT2_DESC, 
                              levels = c("0-60.000", "60.001-120.000", 
                                         "+120.000"),
                              ordered = T),
    capital_factor_3 = factor(insurance$CAPITAL_CAT3_DESC, 
                              levels = c("0-100.000", "100.001-200.000", 
                                         "+200.000"),
                              ordered = T),
    IMC              = as.numeric(gsub(",", ".", gsub("\\.", "", insurance$IMC))),
    IMC_factor_1     = factor(insurance$IMC_CAT1_DESC, 
                              levels = c("Normal", "Riesgo"),
                              labels = c("normal", "risk")),
    IMC_factor_2     = factor(insurance$IMC_CAT2_DESC,
                              levels = c("Normal", "Sobrepeso"),
                              labels = c("normal", "overweight"))
  ) %>%
  
  select(
    key, period, cover, sex,  smoker, good_health, actuarial_age, sus_age, exp,
    mortality, duration, duration_cat, capital, capital_factor_1, capital_factor_2,
    capital_factor_3, IMC, IMC_factor_1, IMC_factor_2
  )

Here, we remove the repeated columns

insurance <- tibble::rowid_to_column(insurance, "ID")

# here we take the IDs that are not repeated
IDs <- insurance %>%
  arrange(desc(mortality)) %>%
  distinct_at(vars(key, period, cover), .keep_all = T) %>%
  select(ID)

# here we ensure that the rows we are removing have mortality = 0
insurance %>%
  filter(!(ID %in% IDs$ID)) %>%
  select(mortality) %>%
  summary()
##  mortality
##  no :63   
##  yes: 0
insurance <- insurance %>%
  filter(ID %in% IDs$ID)

insurance

Also, we create another df of only the people that had some sinister

# mortalityS
key_mortality <- insurance %>% # tomamos todas las claves de las personas con mort
  filter(mortality == "yes") %>%
  select(key)

insurance_mortality <- insurance %>%
  filter(key %in% key_mortality$key)

insurance_mortality

And another one with the people that drop

# lapse
# Here we take the keys of the lapse

keys_lapse <- insurance %>%
  filter((key %in% key_mortality$key) == F) %>%
  group_by(key) %>%
  count() %>%
  mutate(lapse = ifelse(n == 1 | n == 2, "yes", "no")) %>%
  filter(lapse == "yes") %>%
  select(key)

insurance_lapse <- insurance %>%
  filter((key %in% key_mortality$key) == F) %>%
  mutate(lapse = ifelse(key %in% keys_lapse$key, "yes", "no")) %>%
  arrange(key) 

for (i in 2:nrow(insurance_lapse)){
  if (insurance_lapse$lapse[i-1] == "yes") {
    if (insurance_lapse$key[i-1] == insurance_lapse$key[i]) {
      insurance_lapse$lapse[i-1] = "no"
    }
  }
}

insurance_lapse$mortality = NULL
insurance_lapse
# saveRDS(insurance, file = "insurance.RDS")
# saveRDS(insurance_mortality, file = "insurance_mortality.RDS")
# saveRDS(insurance_lapse, file = "insurance_lapse.RDS")

insurance <- readRDS("insurance.RDS")
insurance_mortality <- readRDS("insurance_mortality.RDS")
insurance_lapse <- readRDS("insurance_lapse.RDS")